# output:
#   BiocStyle::pdf_document:
#       toc: TRUE
# 
# knitr::opts_chunk$set(echo=FALSE, error=FALSE, warning=FALSE,message=FALSE)

# tables
# replace html with latex

# regulatedEvents %>%
#     slice_sample(n=10) %>%
#     kable(., "html") %>%                                                                                        
#     kableExtra::kable_styling("striped") %>%
#     kableExtra::scroll_box(height="300px", width = "94%")

1 Analysis setup

This report describes the quantification of KCl sensitive of RT-Stop sites in the in-vitro RT-Stop data sets. A detailed description of the initial RT-Stop site definition of KCl and NaCl is given in the reports RT_Stop_Sites_Kcl.Rmd and RT_Stop_Sites_NaCl.Rmd. We expect some of these peaks to change in their strength depending on the presence of NaCl or KCl. KCl is meant to stabilize G4 formation, therefore peaks should mostly go up when comparing to NaCl. Here we want to test which G4 peaks are sensitive for KCl and which are not.

1.1 Load libraries

### ============================================================================
### Load Packages
### ----------------------------------------------------------------------------
### 
library(rtracklayer)
library(GenomicRanges)
library(ggplot2)
library(AnnotationDbi)
library(dplyr)
library(reshape2)
library(UpSetR)
library(GenomicFeatures)
library(kableExtra)
library(knitr)
library(ggrepel)
library(gridExtra)
library(grid)
library(viridis)
library(ComplexHeatmap)
library(forcats)
library(ggtext)
library(patchwork)
library(ggforce)
library(ggpointdensity)
library(knitr)
library(tidyr)
library(ggsci)
library(DESeq2)
library(gghalves)
library(ggrastr)
library(ggnewscale)
library(BindingSiteFinder)
library(xlsx)
library(ggseqlogo)
library(stringr)
source("/Users/mirko/Projects/GeneralScripts/CustomThemes.R")

2 Merge KCl and NaCl peaks

Here I load the raw signal per construct as list and the related sets of peaks as GenomicRanges from RData files.

# Load RT-Stop data per construct
constructInfo = readRDS("../rds/constructInformation.rds")
constructInfo = constructInfo[!grepl("Spike",constructInfo$Region.ID),]
constructSignal = readRDS("../rds/RTstop.profiling.l.rds")

# Load KCl sites
load("../01_BindingSites/condition_kcl/mean_cl1/data/sitesReproducible.rda")
name = paste0(seqnames(sitesReproducible), "_", unlist(sapply(runLength(seqnames(sitesReproducible)), seq)))
names(sitesReproducible) = name
seqlevels(sitesReproducible) <- as.character(sprintf("%05d", 1:12000))
peaksKCL = sitesReproducible

# Load NaCl sites
load("../01_BindingSites/condition_nacl/data/sitesReproducible.rda")
name = paste0(seqnames(sitesReproducible), "_", unlist(sapply(runLength(seqnames(sitesReproducible)), seq)))
names(sitesReproducible) = name
seqlevels(sitesReproducible) <- as.character(sprintf("%05d", 1:12000))
peaksNACL = sitesReproducible

Here I combined both sets of peaks. To keep the desired width of 3nt I applied the merge and re-size routine on all peak midpoints. This essentially uses the combined coverage of all replicates in both conditions to decide on the center of any peak region that only partially overlaps in the two sets. In the end I annotate for each peak if it was seen in NaCl, KCl or both conditions.

# Resize bs to their center and combine over conditions
peaks = unique(c(resize(peaksKCL, fix = "center", width = 1),
                   resize(peaksNACL, fix = "center", width = 1)))

# Prepare BSF data object
meta = data.frame(id = 1:6, condition = c(rep("GTP_KCL",3), rep("GTP_NACL", 3)))
sgn = list(signalPlus = constructSignal[1:6])
names(sgn$signalPlus) = paste0(meta$id, "_", meta$condition)
sgn$signalMinus = constructSignal[1:6]
names(sgn$signalMinus) = paste0(meta$id, "_", meta$condition)
bds = BSFDataSet(ranges = peaks, meta = meta, signal = sgn, silent = TRUE)

# Final binding site settings
bdsMerge <- makeBindingSites(object = bds, bsSize = 3, minWidth = 0,
                        minCrosslinks = 2, minClSites = 1, centerIsSummit = FALSE)
df = myNumberFormat(getSummary(bdsMerge))
kable(df, "html", caption = "Merge and combine") %>%
  kableExtra::kable_styling("striped") %>%
  kableExtra::scroll_box(height="300px", width = "94%")
Table 1: Merge and combine
Option nRanges
inputRanges 46,326
mergeCrosslinkSites 42,271
minCrosslinks 42,271
minClSites 42,271
centerIsClSite 42,239
centerIsSummit NA
# kable(myNumberFormat(df), "latex", caption = "Merge and combine", booktabs = TRUE) 
rng = BindingSiteFinder::getRanges(bdsMerge)
r1 = resize(peaksKCL, fix = "center", width = 1)
r2 = resize(peaksNACL, fix = "center", width = 1)

df = data.frame(olKCL = countOverlaps(rng, r1),
                olNACL = countOverlaps(rng, r2))
m = make_comb_mat(df)

ha = HeatmapAnnotation(
  "Intersections" = anno_barplot(comb_size(m),
                                 border = FALSE,
                                 gp = gpar(fill = "#a6a6a6"),
                                 height = unit(6, "cm"))
)
ha2 = HeatmapAnnotation(
  "set sizes" = anno_barplot(set_size(m),
                             border = FALSE,
                             gp = gpar(fill = "#a6a6a6"),
                             width = unit(4, "cm")),
  which = "row"
)

ht = UpSet(m,
           comb_order = order(comb_size(m), decreasing = T),
           top_annotation = ha,
           right_annotation = ha2,
           comb_col = "#003399", bg_col = "white", pt_size = unit(.5, "cm") ,
           border = T, lwd = 2, bg_pt_col = "#a6a6a6"
)

ss = set_size(m)
cs = comb_size(m)
ht = draw(ht,  padding = unit(c(0, 0, 10, 0), "mm"))
od = column_order(ht)
decorate_annotation("Intersections", {
  grid.text(format(cs[od], big.mark = ".", decimal.mark = ","),
            x = seq_along(cs), y = unit(cs[od], "native") + unit(.1, "pt"), 
            default.units = "native", just = c("left", "bottom"), 
            gp = gpar(fontsize = 8, col = "black"), rot = 45)
})
Overlap between NaCl and KCl peaks. Overlaps were computed after resolving partial overlaps.

Figure 1: Overlap between NaCl and KCl peaks
Overlaps were computed after resolving partial overlaps.

# Annotate with condition support
mcols(rng) = cbind(mcols(rng), df)
peaks_all = rng

# Set peak names
name = paste0(seqnames(peaks_all), "_", unlist(sapply(runLength(seqnames(peaks_all)), seq)))
names(peaks_all) = name
seqlevels(peaks_all) = as.character(sprintf("%05d", 1:12000))

# Export as bed file
rtracklayer::export(rng, 
       "./data/peaks_all.bed",
       format = "BED", 
       trackLine = new("BasicTrackLine",
                       name = "GTP-KCl+NaCl peaks",
                       description = "Merged 3nt RTstop peaks, reproducible in all 3 replicates per condition, partial-overlap resolved.") )
save(rng, file = "./data/peaks_all.rda")

3 Construct epxression

Constructs with in general low counts were removed from the differential analysis, because for these we don’t have the count power to tell if they are sensitive to KCl or not. In order to do so, counts per construct were normalized to library size with DESeq2 and only constructs with at least 100 normalized counts from all 6 samples were retained.

# Select summed counts per construct
m = sapply(constructSignal, sum) %>%
  as.data.frame() %>% 
  dplyr::select(1:6)

# Normalize counts by library size
meta = data.frame(id = 1:6, condition = c(rep("GTP_KCL",3), rep("GTP_NACL", 3)))
dds = DESeqDataSetFromMatrix(countData = m, colData = meta, design = ~1)
dds = estimateSizeFactors(dds)
normCounts = counts(dds, normalized = TRUE)

# Plot count diagnostic
df = normCounts %>% 
  as.data.frame() %>% 
  tibble::rownames_to_column() %>%
  pivot_longer(-rowname) %>% 
  group_by(name) %>% 
  mutate(rank = rank(value, ties.method = "first"))

ggplot(df, aes(x = rank, y = value)) +
  ggrastr::rasterise(geom_pointdensity(), dpi = 300) +
  facet_wrap(~name) +
  theme_nice() +
  geom_hline(yintercept = 100, linetype = "dashed") +
  scale_y_log10() +
  scale_color_viridis(option = "rocket") +
  theme(legend.position = "top") +
  guides(color = guide_colorbar(title.position = 'top', title.hjust = 0.5,
                                barwidth = unit(20, 'lines'), barheight = unit(.5, 'lines'))) +
  labs(x = "Rank",
       y = "Normalized counts")
Global expression. Counts were normalized for library size with DESeq. These were summed up per construct and sorted by their rank. Only constructs with at least 100 normalized counts in all 6 samples are considered sufficiently large expressed.

Figure 2: Global expression
Counts were normalized for library size with DESeq. These were summed up per construct and sorted by their rank. Only constructs with at least 100 normalized counts in all 6 samples are considered sufficiently large expressed.

# Select constructs above threshold
const_Expressed = as.data.frame(table(df$rowname[df$value >= 100]))
const_Expressed = const_Expressed$Var1[const_Expressed$Freq == 6]

# Exclude all constrcuts that are not above threshold
const_All = levels(seqnames(peaks_all))
const_notExpressed = const_All[!const_All %in% const_Expressed]

# Exclude all constructs with high enough expression only in the read-through region
# -> no peak was called on these
const_withPeak = unique(seqnames(peaks_all))
const_onlyReadThroughSignal = const_Expressed[!const_Expressed %in% const_withPeak]
const_ExpressedAndPeak = const_Expressed[!const_Expressed %in% const_onlyReadThroughSignal]
peaks_expressed = peaks_all[seqnames(peaks_all) %in% const_ExpressedAndPeak]

4 Differentially regulated peaks between NaCl and KCl

To compute fold-changes per peak for each construct between the NaCl and KCl condition I used a DESeq2 based approach. In order to remove the bias of constructs that strongly change in their expression between the conditions, I used a modified DESeq2 test.

4.1 Construct level change

Here I specifically tested for those constructs that significantly change less than a log2 fold-change of 1 (less than 2-fold), using the parameter altHypothesis = "lessAbs" and an lfcThreshold = 1 in the results function. This results in all constructs that are affected by a less than 2-fold global change between conditions.

# Get total counts per construct
m = sapply(constructSignal, sum) %>%
  as.data.frame() %>%
  dplyr::select(c(1:6))
m = m[rownames(m) %in% unique(seqnames(peaks_expressed)),]

# Run DESeq2
dds = DESeqDataSetFromMatrix(countData = m, colData = meta, design = ~condition)
dds = DESeq(dds)
resGlobalNot = results(dds, contrast = c("condition", "GTP_KCL", "GTP_NACL"),
                       altHypothesis = "lessAbs", lfcThreshold = 1, alpha = 0.05) %>%
  as.data.frame() %>% mutate(sig = ifelse(padj < 0.05, TRUE, FALSE)) %>%
  mutate(sigDir = ifelse(sig == FALSE & log2FoldChange > 0, "Up",
                         ifelse(sig == FALSE & log2FoldChange < 0, "Down", "Not"))) %>%
  mutate(sigDir = factor(sigDir, levels = c("Not", "Up", "Down")))

# Filter results
const_globalLarge = rownames(resGlobalNot[resGlobalNot$sig == FALSE,])
const_globalSmall = rownames(resGlobalNot[resGlobalNot$sig == TRUE,])
peaks_globalLarge = peaks_expressed[seqnames(peaks_expressed) %in% const_globalLarge]
peaks_globalSmall = peaks_expressed[seqnames(peaks_expressed) %in% const_globalSmall]
df = resGlobalNot %>%
  mutate(sig = ifelse(padj < 0.05, TRUE, FALSE)) 

dfN = df %>%
  group_by(sig) %>%
  tally()

p1 = ggplot(dfN, aes(x = "Construtcts", y = n, fill = sig, color = sig)) +
  geom_col(position = "fill") +
  scale_fill_manual(values = c("#999999", "#D8D9CF")) +
  scale_color_manual(values = c("#4d4d4d", "#979a7e")) +
  theme_pub() +
  theme(legend.position = "top") +
  labs(
    x = "",
    y = "Fraction",
    fill = "Significant", 
    color = "Significant"
  ) +
  guides(color = guide_legend(override.aes = list(size = 1))) + 
  theme(legend.key.size = unit(0.5, 'cm'), legend.position = "top") +
  coord_flip()
  
p2 = ggplot(df, aes(x = log2FoldChange)) +
  annotate("rect", xmin = -1, xmax = 1, ymax = Inf, ymin = -Inf, fill = "#D8D9CF") +
  geom_histogram(bins = 50, fill = "#999999", color = "#4d4d4d") +
  theme_pub() +
  theme(aspect.ratio = 1) +
  geom_vline(xintercept = -1, linetype = "dashed") +
  geom_vline(xintercept = 1, linetype = "dashed") +
  labs(
    x = "Fold-change[log2]",
    y = "Count"
  ) 

p1 / p2 + patchwork::plot_layout(heights = c(0.1, 0.9))
Basic plots of constructs that do not change more than 2-fold globaly.

Figure 3: Basic plots of constructs that do not change more than 2-fold globaly

p1 = ggplot() +
  geom_point(data = df, aes(x = log2(baseMean), y = log2FoldChange, fill = sig, color = sig), shape = 21) +
  scale_fill_manual(values = c("#999999", "#D8D9CF")) +
  scale_color_manual(values = c("#4d4d4d", "#979a7e")) +
  theme_pub() +
  theme(legend.position = "top") +
  labs(
    x = "BaseMean [log2]",
    y = "Fold-change [log2]",
    fill = "Significant", 
    color = "Significant"
  ) +
  guides(color = guide_legend(override.aes = list(size = 4))) + 
  theme(legend.key.size = unit(1, 'cm'), legend.position = "top") +
  theme(legend.position = "top") +
  ylim(-3,3) +
  theme(aspect.ratio = 1) 

p2 = ggplot() +
  geom_point(data = df, aes(x = log2FoldChange, y = -log10(padj), fill = sig, color = sig), shape = 21) +
  scale_fill_manual(values = c("#999999", "#D8D9CF")) +
  scale_color_manual(values = c("#4d4d4d", "#979a7e")) +
  theme_pub() +
  theme(legend.position = "top") +
  labs(
    x = "Fold-change [log2]",
    y = "Adjusted P value [-log10]",
    fill = "Significant", 
    color = "Significant"
  ) +
  guides(color = guide_legend(override.aes = list(size = 4))) + 
  theme(legend.key.size = unit(1, 'cm'), legend.position = "top") +
  theme(legend.position = "top") +
  xlim(-3,3) +
  theme(aspect.ratio = 1) 

p1 + p2
MA and volcanoe plots of constructs that do not change more than 2-fold globaly.

Figure 4: MA and volcanoe plots of constructs that do not change more than 2-fold globaly

4.2 Peak level change

Next I run a second round of DESeq2 on the level of individual RT-Stop peaks. Here I now test for each peak if counts are different between NaCl and KCl using (contrast = c("condition", "GTP_KCL", "GTP_NACL")). Lastly, all peaks from constructs that change their expression level significantly are removed.

# Get crosslink counts per peak
bds = setRanges(bds, peaks_expressed)
cov = coverageOverRanges(bds, returnOptions = "merge_positions_keep_replicates")

# Prepare SE object
se = SummarizedExperiment(assays = list(counts = as.matrix(mcols(cov))), 
                          rowRanges = granges(cov), colData = meta)

# Run DESeq2
dds = DESeqDataSet(se, design = ~ condition)
dds = DESeq(dds)
resSimple = lfcShrink(dds, contrast = c("condition", "GTP_KCL", "GTP_NACL"), type = "ashr")
resSimple = as.data.frame(resSimple) %>% 
  mutate(sig = ifelse(padj < 0.05, TRUE, FALSE)) %>%
  mutate(sigDir = ifelse(sig == TRUE & log2FoldChange > 0, "Up",
                         ifelse(sig == TRUE & log2FoldChange < 0, "Down", "Not"))) %>%
  mutate(sigDir = factor(sigDir, levels = c("Not", "Up", "Down")))
  
idx = match(rownames(resSimple), names(cov))
resSimple$constructID = as.character(seqnames(cov))[idx]

# Combine construct and peak results
resChange = resSimple[resSimple$constructID %in% const_globalSmall,]
df = resSimple %>%
  mutate(sig = ifelse(padj < 0.05, TRUE, FALSE)) %>%
  mutate(sigDir = ifelse(sig == TRUE & log2FoldChange > 0, "Up",
                         ifelse(sig == TRUE & log2FoldChange < 0, "Down", "Not")))  %>%
  mutate(sigDir = factor(sigDir, levels = c("Not", "Up", "Down")))

dfN = df %>%
  group_by(sigDir) %>%
  tally()

p1 = ggplot(dfN, aes(x = "Construtcts", y = n, fill = sigDir, color = sigDir)) +
  geom_col(position = "fill") +
  scale_fill_manual(values = c("#999999", "#874C62", "#68b1a5")) +
  scale_color_manual(values = c("#4d4d4d", "#623747", "#2b544d")) +
  theme_pub() +
  theme(legend.position = "top") +
  labs(
    x = "",
    y = "Fraction",
    fill = "Significant", 
    color = "Significant"
  ) +
  guides(color = guide_legend(override.aes = list(size = 1))) + 
  theme(legend.key.size = unit(0.5, 'cm'), legend.position = "top") +
  coord_flip()
  
p2 = ggplot(df, aes(x = log2FoldChange, fill = sigDir, color = sigDir)) +
  geom_histogram(bins = 50, fill = "#999999", color = "#4d4d4d") +
  theme_pub() +
  theme(aspect.ratio = 1) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  labs(
    x = "Fold-change[log2]",
    y = "Count"
  )

p1 / p2 + patchwork::plot_layout(heights = c(0.1, 0.9))
Basic plots of peaks that change between conditions.

Figure 5: Basic plots of peaks that change between conditions

dfPlot = resChange %>%
  mutate(sig = ifelse(sig == TRUE & log2FoldChange > 0, "Up", ifelse(sig == TRUE & log2FoldChange < 0, "Down", "Not"))) %>%
  mutate(sig = factor(sig, levels = c("Not", "Up", "Down"))) %>%
  arrange(desc(sig))
p1 = ggplot(dfPlot, aes(x = log2(baseMean), y = log2FoldChange, color = sig, fill = sig)) +
  geom_point(shape = 21) +
  scale_fill_manual(values = c("#999999", "#874C62", "#68b1a5")) +
  scale_color_manual(values = c("#4d4d4d", "#623747", "#2b544d")) +
  theme_pub() +
  theme(legend.position = "top") +
  labs(
    y = "Fold-change [log2]",
    x = "BaseMean [log2]",
    fill = "Significant", 
    color = "Significant"
  ) +
  guides(color = guide_legend(override.aes = list(size = 4))) + 
  theme(legend.key.size = unit(1, 'cm'), legend.position = "top") +
  theme(legend.position = "top") +
  ylim(-8,8) +
  theme(aspect.ratio = 1) 

p2 = ggplot(dfPlot, aes(x = log2FoldChange, y = -log10(padj), color = sig, fill = sig)) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  geom_point(shape = 21) +
  scale_fill_manual(values = c("#999999", "#874C62", "#68b1a5")) +
  scale_color_manual(values = c("#4d4d4d", "#623747", "#2b544d")) +
  theme_pub() +
  theme(legend.position = "top") +
  labs(
    x = "Fold-change [log2]",
    y = "Adjusted P value [-log10]",
    fill = "Significant", 
    color = "Significant"
  ) +
  guides(color = guide_legend(override.aes = list(size = 4))) + 
  theme(legend.key.size = unit(1, 'cm'), legend.position = "top") +
  theme(legend.position = "top") +
  xlim(-8,8) +
  theme(aspect.ratio = 1) 

p1 + p2
MA and volcanoe plots of constructs that do not change more than 2-fold globaly.

Figure 6: MA and volcanoe plots of constructs that do not change more than 2-fold globaly

dfPlot = resChange %>%
  mutate(sig = ifelse(sig == TRUE, "Yes", "No")) %>%
  mutate(sig = factor(sig, levels = c("No", "Yes"))) %>%
  arrange(desc(sig))
p1 = ggplot(dfPlot, aes(x = log2FoldChange, y = -log10(padj), color = sig)) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  ggrastr::rasterise(geom_point(size = 1), dpi = 300) +
  scale_color_manual(values = c("#808080", pal_npg()(10)[3])) +
  theme_pub() +
  theme(legend.position = "top") +
  xlim(-8,8) +
  theme(aspect.ratio = 1) +
  labs(
    x = "Fold-change [log2]",
    y = "Adjusted P value [-log10]",
    title = "Version 01"
  ) +
  guides(color = guide_legend(override.aes = list(size = 4))) + 
  theme(legend.key.size = unit(1, 'cm'), legend.position = "top") 

dfPlot = resChange %>%
  mutate(sig = ifelse(sig == TRUE & log2FoldChange > 0, "Up", ifelse(sig == TRUE & log2FoldChange < 0, "Down", "Not"))) %>%
  mutate(sig = factor(sig, levels = c("Not", "Up", "Down"))) %>%
  arrange(desc(sig))
p2 = ggplot(dfPlot, aes(x = log2FoldChange, y = -log10(padj), color = sig)) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  ggrastr::rasterise(geom_point(size = 1), dpi = 300) +
  scale_color_manual(values = c("#999999", "#874C62", "#68b1a5")) +
  theme_pub() +
  theme(legend.position = "top") +
  xlim(-8,8) +
  theme(aspect.ratio = 1) +
  labs(
    x = "Fold-change [log2]",
    y = "Adjusted P value [-log10]",
    title = "Version 02"
  ) +
  guides(color = guide_legend(override.aes = list(size = 4))) + 
  theme(legend.key.size = unit(1, 'cm'), legend.position = "top") 

p3 = ggplot(dfPlot, aes(x = log2FoldChange, y = -log10(padj), color = sig, fill = sig)) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  ggrastr::rasterise(geom_point(size = 1, shape = 21), dpi = 300) +
  scale_fill_manual(values = c("#999999", "#874C62", "#68b1a5")) +
  scale_color_manual(values = c("#4d4d4d", "#623747", "#2b544d")) +
  theme_pub() +
  theme(legend.position = "top") +
  xlim(-8,8) +
  theme(aspect.ratio = 1) +
  labs(
    x = "Fold-change [log2]",
    y = "Adjusted P value [-log10]",
    title = "Version 03"
  ) +
  guides(color = guide_legend(override.aes = list(size = 4))) + 
  theme(legend.key.size = unit(1, 'cm'), legend.position = "top") 
p1 + p2 + p3
Volcano selection for figures.

Figure 7: Volcano selection for figures

4.3 Construct classification

In the following every of the 12,000 constructs gets assigned to exactly one classification term, which are further grouped into three main categories. A construct gets assigned to term Not analyzed if it was not expressed high enough, had no peak called or if we observed a significant expression level change. All other constructs are classified as G4 containing or Not G4 containing based on the presence of a RT-Stop peak that significantly changes up or down between KCl and NaCl.

# 1) For all constructs 
# -> Classify expression status 
df1 = data.frame(constructID = const_notExpressed, status = "Not expressed")
df2 = data.frame(constructID = const_onlyReadThroughSignal, status = "No peak")
df3 = data.frame(constructID = const_globalLarge, status = "Global change")

# 2) For all "Expressed + small change" constructs
# -> Classify significance of constructs based on peaks
idx = match(names(peaks_globalSmall), rownames(resChange))
mcols(peaks_globalSmall) = cbind.data.frame(mcols(peaks_globalSmall), resChange[idx,])
peaks_globalSmall_sig = peaks_globalSmall[peaks_globalSmall$sig == TRUE]
const_globalSmall_sig = as.character(unique(seqnames(peaks_globalSmall_sig)))
const_globalSmall_sigNot = const_globalSmall[!const_globalSmall %in% const_globalSmall_sig]

df4 = data.frame(constructID = const_globalSmall_sigNot, status = "Not significant")

# 3) For all "Expressed + small change" and "Significant"
# -> Classify direction
resChange$dir = ifelse(resChange$log2FoldChange > 0, "Up", "Down")
constTable = as.data.frame(table(resChange[resChange$sig == TRUE,]$constructID, resChange[resChange$sig == TRUE,]$dir)) %>% 
  pivot_wider(names_from = c(Var2), values_from = Freq) %>% as.data.frame()
sig_up = constTable$Var1[constTable$Down == 0 & constTable$Up >= 1]
sig_down = constTable$Var1[constTable$Down >= 1 & constTable$Up == 0]
sig_mixed = constTable$Var1[!constTable$Var1 %in% sig_up]
sig_mixed = sig_mixed[!sig_mixed %in% sig_down]

df5 = data.frame(constructID = sig_up, status = "Significant-Up")
df6 = data.frame(constructID = sig_down, status = "Significant-Down")
df7 = data.frame(constructID = sig_mixed, status = "Significant-Mixed")

dfAssignment = rbind(df1,df2,df3,df4,df5,df6,df7)
dfAssignment$statusCombined = ifelse(dfAssignment$status == "Global change" | dfAssignment$status == "No peak" | dfAssignment$status == "Not expressed", "Not analysed", 
                              ifelse(dfAssignment$status  == "Not significant" | dfAssignment$status == "Significant-Down", "No G4","G4"))

df = dfAssignment %>% group_by(status, statusCombined) %>% tally()
ggplot(df, aes(x = statusCombined, y = n, fill = status)) +
  geom_col(position = "dodge") +
  scale_fill_d3() +
  theme_pub() +
  labs(
    x = "#Classification group",
    y = "#N",
    fill = "Classification term"
  ) +
  geom_text(aes(label = n), vjust=-0.4, size = 3, position = position_dodge(width = .9)) +
  guides(color = guide_legend(override.aes = list(size = 1))) + 
  theme(legend.key.size = unit(1, 'cm'), legend.position = "top") 
Construct classification in terms and groups.

Figure 8: Construct classification in terms and groups

Here I update the constructInfo dataframe, which is used in the shiniy app.

### 1) append construct info list by G4 status
idx = match(constructInfo$Construct.ID, dfAssignment$constructID)
constructInfo2 = cbind.data.frame(constructInfo, dfAssignment[idx,])

### 2) add information to peaks granges
idx = match(names(peaks_expressed), rownames(resSimple))
mcols(peaks_expressed) = cbind(mcols(peaks_expressed), resSimple[idx,])
idx = match(peaks_expressed$constructID, dfAssignment$constructID)
mcols(peaks_expressed) = cbind(mcols(peaks_expressed), dfAssignment[idx,])
peaks_expressed = peaks_expressed[peaks_expressed$constructID %in% const_globalSmall]
idx = match(peaks_expressed$constructID, constructInfo2$Construct.ID)
mcols(peaks_expressed)$RegionID = constructInfo2$Region.ID[idx]

### 3) append construct info list with number of peak info per category
# add number of peaks per construct to info table - all peaks
df1 = peaks_expressed %>% 
  mcols() %>% 
  as.data.frame() %>% 
  select(statusCombined, constructID) %>% 
  group_by(constructID) %>% 
  summarise(nPeaks = n())
idx = match(constructInfo2$constructID, df1$constructID)
constructInfo2$nPeaks = df1$nPeaks[idx]
# add number of peaks per construct to info table - significant peaks
df2 = peaks_expressed %>% 
  mcols() %>% 
  as.data.frame() %>% 
  filter(sig == TRUE) %>% 
  select(statusCombined, constructID) %>% 
  group_by(constructID) %>% 
  summarise(nPeaks = n())
idx = match(constructInfo2$constructID, df2$constructID)
constructInfo2$nPeaksSig = df2$nPeaks[idx]
# add number of peaks per construct to info table - significant peaks up
df3 = peaks_expressed %>% 
  mcols() %>% 
  as.data.frame() %>% 
  filter(sig == TRUE & log2FoldChange > 0) %>% 
  select(statusCombined, constructID) %>% 
  group_by(constructID) %>% 
  summarise(nPeaks = n())
idx = match(constructInfo2$constructID, df3$constructID)
constructInfo2$nPeaksSigUp = df3$nPeaks[idx]

### 4) save and export
saveRDS(constructInfo2, file = "./data/constructInfo2.rds")
saveRDS(peaks_expressed, file = "./data/peaks_expressed.rds")
head(constructInfo2)
##   Construct.ID          Region.ID                                   LSV.ID
## 1        00001      regulated.797 ENSG00000156976.15:s:186787776-186788256
## 2        00002      regulated.845   ENSG00000116350.16:t:29160375-29160563
## 3        00003 non-regulated.1087   ENSG00000178927.17:t:82446623-82446696
## 4        00004  non-regulated.720 ENSG00000126214.21:s:103687081-103687211
## 5        00005     regulated.1320   ENSG00000137501.17:s:85707429-85707531
## 6        00006      regulated.176   ENSG00000169592.14:s:30000929-30001040
##           Gene.ID Gene.Name    regulation Hill.cat region   type original.width
## 1 ENSG00000156976    EIF4A2     regulated        4    DE5     BS              5
## 2 ENSG00000116350     SRSF4     regulated        1    UI5 pG4+BS             49
## 3 ENSG00000178927     CYBC1 non-regulated     <NA>    UI3     BS              5
## 4 ENSG00000126214      KLC1 non-regulated     <NA>    DI3     BS              5
## 5 ENSG00000137501     SYTL2     regulated        1    DI5     BS              5
## 6 ENSG00000169592    INO80E     regulated        4    UI5     BS              5
##   strand                    coords         barcode
## 1      +  chr3:186789102-186789247 TGCCTACAGACGGTC
## 2      -    chr1:29181393-29181538 ATTACGAGGCTGGCA
## 3      -   chr17:82447563-82447708 AGTCTTTTATCAGCT
## 4      + chr14:103700609-103700754 AAGCTGGACTCATCC
## 5      -   chr11:85704360-85704505 TGTGGACCTGCATGT
## 6      +   chr16:30001055-30001200 TGGAAACGTAGGATG
##                                                                                                                                                                                                        Seq
## 1 TAATACGACTCACTATAGTTCTGATTCTTTTTCTTACACAGAATTGGCAGAGGGGGTCGATTTGGGAGGAAAGGTGTGGCTATAAACTTTGTTACTGAAGAAGACAAGAGGATTCTTCGTGACATTGAGACTTTCTACAATACTACAGTGGAGGAGATGCCCATTGCCTACAGACGGTCAGATCGGAAGAGCGGTTCAGC
## 2 TAATACGACTCACTATAGGCCTGGCCAGGCTCCGCCATAGTGGCGGCGCCTCGGGGGCGCGGGTAGGCCCCGGTTCCCACTGCGCCTGCGCGGGTGGGGGTGGTGGAGGCATAATGGGAGCTGTGCTGAGGCCCTGAGTGGGACGGGGAGAAGGAAAGCCCGAGATTACGAGGCTGGCAAGATCGGAAGAGCGGTTCAGC
## 3 TAATACGACTCACTATAGTCAGGGGCTGTGGCTCCTGGTCCCGCTGCACCAGGGAGGTGGGGTCTTACCGGGATCAATAGGCAGGCAGTCACTTTCTCTTCATAGGAATCTTGTCGATTGGCCTGGCTGCTGCCTACTACAGCGGAGGTAAAGCACCCCCCGCCAGTCTTTTATCAGCTAGATCGGAAGAGCGGTTCAGC
## 4 TAATACGACTCACTATAGGCCGCCTGCACGCCCTGAGTGAAACTCGTCTGTGCTTCATCTCCAGGGCGTCTCTGGCCGAGCCTCTTTTTGTGGAAAACGACAGCAGCAGCAGTGGCCTGGAAGACGCCACCGCTAACGTGAGTCCCACGGCCTGCAGCCCCAGGAAGCTGGACTCATCCAGATCGGAAGAGCGGTTCAGC
## 5 TAATACGACTCACTATAGAAAACTAGTTCCTGAAATCTTTTATATCCTCATGGGTTTTATGTCTGTGTGTACTGTGAATTACTAGGAGAGTTACAATTGTGAATTTGTCTTTTTCTCCTTTTAGTTTTATTATTTTTGATTCACTATATTTGAGATTGTCTTATTGTGGACCTGCATGTAGATCGGAAGAGCGGTTCAGC
## 6 TAATACGACTCACTATAGGGGATGGGAAGTGCTTGGGAGTAAGGTGGGCGTCATTTGGGCAGAAGGGCTGGGCCTCGGGGCAAGGCCAGCCCAACTTTGGGGACATCTGTCTGGGTGTGGCCCAAGTGTCCCGTGGAGGGTGTGAGTCGGGGCTGCCCCCTTCTTGGAAACGTAGGATGAGATCGGAAGAGCGGTTCAGC
##   constructID            status statusCombined nPeaks nPeaksSig nPeaksSigUp
## 1       00001           No peak   Not analysed     NA        NA          NA
## 2       00002     Global change   Not analysed     NA        NA          NA
## 3       00003    Significant-Up             G4      8         1           1
## 4       00004           No peak   Not analysed     NA        NA          NA
## 5       00005     Not expressed   Not analysed     NA        NA          NA
## 6       00006 Significant-Mixed             G4      8         8           7

Create an overview table with all numbers from the final assignment.

library(janitor)
df = constructInfo2 %>% 
  group_by(statusCombined) %>%
  summarise(nConst = n(),
            nRegion = length(unique(Region.ID)),
            nPeaks = sum(nPeaks, na.rm = T),
            nPeaksSig = sum(nPeaksSig, na.rm = T),
            nPeaksSigUp = sum(nPeaksSigUp, na.rm = T)) %>%
  adorn_totals() %>%
  tibble::column_to_rownames("statusCombined") 

df = df[c(3,2,1,4),]
colnames(df) = c("#Constructs", "#Rregions", "#Peaks", "#Peaks-sig", "#Peaks-sig-up")
df = myNumberFormat(df)

kable(df, "html", caption = "Construct, region and peak classifcation results.") %>%                                                                         
  kableExtra::kable_styling("striped") %>%
  kableExtra::scroll_box(height="300px", width = "94%")
Table 2: Construct, region and peak classifcation results.
#Constructs #Rregions #Peaks #Peaks-sig #Peaks-sig-up
Not analysed 4,427 1,919 0 0 0
No G4 3,997 1,473 16,484 1,524 0
G4 3,576 1,306 20,140 10,693 8,986
Total 12,000 4,698 36,624 12,217 8,986
# kable(myNumberFormat(df), caption = "Construct, region and peak classifcation results.")

5 Choose representative construct per region

Since each of the 3.000 genomic regions is represented 4 times by the constructs we need to merge constructs back to their original region. The region consensus heatmap and barcharts below shows how well the 4 construct replicates of each region agree in their classification.

m = constructInfo2 %>% 
  group_by(Region.ID, statusCombined) %>% 
  summarise(n = (n())) %>% 
  pivot_wider(names_from = statusCombined, values_from = n) %>% 
  replace(is.na(.), 0) %>% 
  as.data.frame() %>% 
  select(-c(Region.ID)) %>%
  relocate("G4", "No G4", "Not analysed")

m = m[order(m$`Not analysed`, decreasing = TRUE),]
m = m[order(m$`No G4`, decreasing = TRUE),]
m = m[order(m$`G4`, decreasing = TRUE),]

custom.col = structure(c("white", pal_material(palette = "blue-grey")(10)[3],
                         pal_material(palette = "blue-grey")(10)[5], pal_material(palette = "blue-grey")(10)[7],
                         pal_material(palette = "blue-grey")(10)[9]
                         ), names = c("0", "1", "2", "3", "4")) # black, red, green, blue
h = Heatmap(m, column_title = "Region consensus", name = "#construct support per region",
            cluster_columns = FALSE, cluster_rows = FALSE,
            show_row_names = FALSE, show_column_names = TRUE,
            show_row_dend = FALSE,
            border = TRUE, col = custom.col, 
            heatmap_legend_param = list(direction = "horizontal", nrow = 1),
            use_raster = FALSE
)

draw(h, heatmap_legend_side = "bottom", annotation_legend_side = "bottom")
Region and construct agreement based on classification type. Rows = regions; cols = classification; n = number of constructs.

Figure 9: Region and construct agreement based on classification type
Rows = regions; cols = classification; n = number of constructs.

colFill = c("#cccccc", "#BB6464", "#7897AB")
colCol = c("#a6a6a6", "#a04646", "#58788d")

df = m %>% 
  as.data.frame() %>%
  mutate(class = ifelse(G4 %in% c(4,3,2), "G4", ifelse(`No G4` %in% c(4,3,2) & !`G4` %in% c(4,3,2), "No G4", "Not Analyzed"))) %>%
  mutate(across(where(is.numeric), function(x){ifelse(x > 1, 1, x)})) %>%
  pivot_longer(-class) %>% group_by(class, name, value) %>% tally() %>% filter(value == 1) %>% ungroup() %>% group_by(class) %>%
  # filter(!(class == "G4" & name == "No G4")) %>%
  # filter(!(class == "No G4" & name == "G4")) %>%
  mutate(name = factor(name, levels = (c("Not analysed", "No G4", "G4"))),
         class = factor(class, level = rev(c("G4", "No G4", "Not Analyzed"))))

ggplot(df, aes(x = class, y = n, fill = name, color = name)) +
  geom_col(position = "dodge", size = 0.8) +
  scale_fill_manual(values = colFill) +
  scale_color_manual(values = colCol) +
  theme_pub() +
  guides(color = guide_legend(override.aes = list(size = 0.8))) + 
  theme(legend.key.size = unit(1, 'cm'), legend.position = "top") +
  labs(
    x = "Region class",
    y = "#Constructs",
    fill = "Construct class",
    color = "Construct class"
  ) + 
  coord_flip()
Number of constructs classified in other regions compared to those classifed as in the same region.

Figure 10: Number of constructs classified in other regions compared to those classifed as in the same region

For all 3,576 constructs where we observed a G4 (based on up-regulated G4 peaks) we now choose a single construct to represent the entire region from which it originated. This is done by always selecting the construct with the most counts if at least two constructs of this region were classified as up-regulated. The same definition is used to select No G4 containing constructs for the respective region, with the additional constraint that no construct was classified as G4 containing beforehand.

### ============================================================================
### Choose representative construct for region -> G4 Regions
### ----------------------------------------------------------------------------
### Rule: Take the construct with the highest counts; at least 2/4 constructs
### must be classified as G4
###
# Count crosslink sum per construct and match construct counts with region identifier
constCounts = sapply(constructSignal, sum) %>%
  as.data.frame() %>% 
  dplyr::select(1:6)
G4Const = constCounts[rownames(constCounts) %in% dfAssignment$constructID[dfAssignment$statusCombined == "G4"],] 
G4Const$Region.ID = constructInfo2$Region.ID[match(rownames(G4Const), constructInfo2$Construct.ID)]

### 1) keep only those regions that have at least two constructs classified as G4 containing
### -> present in the set G4Const
# diagnostic
df1 = G4Const %>% 
  group_by(Region.ID) %>% 
  summarise(n = n()) %>%
  group_by(n) %>%
  tally(name = "nn") %>%
  mutate(n = as.factor(n)) %>%
  mutate(type = "G4 Regions")

# select for G4 regions with at least 2 G4 constructs
keep = G4Const %>% 
  group_by(Region.ID) %>% 
  summarise(n = n()) %>% 
  filter(n >= 2) %>%
  pull(Region.ID)
G4Const = G4Const[G4Const$Region.ID %in% keep,]
# select the construct with the most counts as representative for the G4 region
keep = G4Const %>%
  mutate(sCounts = rowSums(across(contains("rep")))) %>%
  select(-contains("rep")) %>%
  tibble::rownames_to_column("constructID") %>%
  group_by(Region.ID) %>%
  filter(sCounts == max(sCounts)) %>%
  pull(constructID)
G4Const = G4Const[rownames(G4Const) %in% keep,]

### ============================================================================
### Choose representative construct for region -> No G4 Regions
### ----------------------------------------------------------------------------
### Rule: Take the construct with the highest counts; at least 2/4 constructs
### must be classified as G4; the region can not be already labeled as G4
###
noG4Const = constCounts[rownames(constCounts) %in% dfAssignment$constructID[dfAssignment$statusCombined == "No G4"],] 
noG4Const$Region.ID = constructInfo2$Region.ID[match(rownames(noG4Const), constructInfo2$Construct.ID)]
df2 = noG4Const %>% 
  filter(!Region.ID %in% G4Const$Region.ID) %>%
  group_by(Region.ID) %>% 
  summarise(n = n()) %>%
  group_by(n) %>%
  tally(name = "nn") %>%
  mutate(n = as.factor(n)) %>%
  mutate(type = "No G4 Regions")

# select for no G4 regions with at least to no G4 constructs 
# -> non of these are allowed to be classified as G4
keep = noG4Const %>% 
  filter(!Region.ID %in% G4Const$Region.ID) %>%
  group_by(Region.ID) %>% 
  summarise(n = n()) %>% 
  filter(n >= 2) %>%
  pull(Region.ID)
noG4Const = noG4Const[noG4Const$Region.ID %in% keep,]
# select the construct with the most counts as representative for the G4 region
keep = noG4Const %>%
  mutate(sCounts = rowSums(across(contains("rep")))) %>%
  select(-contains("rep")) %>%
  tibble::rownames_to_column("constructID") %>%
  group_by(Region.ID) %>%
  dplyr::slice(which.max(sCounts)) %>%
  pull(constructID)
noG4Const = noG4Const[rownames(noG4Const) %in% keep,]

### ============================================================================
### Make plot
### ----------------------------------------------------------------------------
###
df = rbind.data.frame(df1,df2)
dfNg4 = df %>%filter(n != 1 & type == "G4 Regions") %>% summarise(s = sum(nn))
dfNnog4 = df %>%filter(n != 1 & type == "No G4 Regions") %>% summarise(s = sum(nn))

colFill = c("#7897AB", "#BB6464")
colCol = c("#58788d", "#a04646")

ggplot(df, aes(x = n, y = nn, fill = type, color = type)) +
  geom_col(position = "dodge", size = 1) +
  theme_pub() +
  labs(
    x = "#Constructs",
    y = "#Regions",
    fill = "Region type",
    color = "Region type"
  ) +
  geom_text(aes(label = nn), vjust=-0.4, size = 3, position = position_dodge(width = .9)) +
  scale_fill_manual(values = colFill) +
  scale_color_manual(values = colCol) + 
  guides(color = guide_legend(override.aes = list(size = 1))) + 
  theme(legend.key.size = unit(1, 'cm'), legend.position = "top") +
  annotate("text", x = 1, y = 450, hjust = 0.3, size = 3,
           label = paste0("#G4 Regions: ", myNumberFormat(dfNg4$s),
                          "\n#No G4 Regions: ", myNumberFormat(dfNnog4$s)))
Region classification by construct agreement.

Figure 11: Region classification by construct agreement

The above definition results in 1,060 G4 and 1,109 No G4 merged construct regions.

6 Choose representative peak per construct

Here we assign a representative RT-Stop peak to each construct, since multiple peaks may exist. As representative always the strongest upregulated RT-Stop peak is chosen. Peak strength is determined by the ratio of RT-Stops within a given peak over the total RT-Stop of the construct. The total strength of a construct is then the sum of all peak strength divided by the number of peaks.

6.1 G4 containing constructs

# Calculate peak and construct strength 
constCountsNew = constCounts %>% tibble::rownames_to_column("ConstructID")
AllG4Peaks = cov[seqnames(cov) %in% rownames(G4Const)]
mcols(AllG4Peaks) = mcols(AllG4Peaks)[1:3]
AllG4Peaks = AllG4Peaks[names(AllG4Peaks) %in% rownames(resChange[resChange$dir == "Up",])]

dfPre = as.data.frame(AllG4Peaks) %>%
  dplyr::mutate(xlinksKCL = X1_GTP_KCL + X2_GTP_KCL + X3_GTP_KCL) %>%
  tibble::rownames_to_column("PeakID") %>%
  select(PeakID, seqnames, xlinksKCL) %>%
  rename("ConstructID" = "seqnames") %>%
  group_by(ConstructID) %>%
  left_join( y = constCountsNew, by = "ConstructID") %>% 
  mutate(xlinksKCL_total = GTP_KCL_rep1 + GTP_KCL_rep2 + GTP_KCL_rep3) %>%
  mutate(xlinksKCL_ratio = xlinksKCL / xlinksKCL_total) %>%
  select(c(PeakID, ConstructID, starts_with("xlinks"))) %>%
  arrange(desc(xlinksKCL_ratio), .by_group = TRUE) %>%
  mutate(sel = as.factor(seq_along(PeakID)) ) %>%
  summarise(PeakID, ConstructID, xlinksKCL, xlinksKCL_total, xlinksKCL_ratio, sel, n = n(), sum_KCL_ratio = sum(xlinksKCL_ratio)) %>%
  mutate(sum_KCL_ratio_norm = sum_KCL_ratio/ n)

df1 = data.frame(selection = "Best peak ratio", value = dfPre %>% filter(sel == 1) %>% pull(xlinksKCL_ratio))
df2 = data.frame(selection = "Total construct ratio", value = dfPre %>% filter(sel == 1) %>% pull(sum_KCL_ratio_norm))
df = rbind(df1,df2)
ggplot(df, aes(x = value, fill = selection)) +
  geom_density() + 
  facet_wrap(~selection) +
  theme_pub() +
  theme(legend.position = "none") +
  labs(
    x = "Indicated ratio",
    y = "Density"
  ) +
  scale_fill_simpsons()
Distribution of peak and construct strength ratios.

Figure 12: Distribution of peak and construct strength ratios

From the total of 3,775 peaks we observe 1,060 peaks on the selected representative constructs, with a mean, median and maximum of 4.7, 4 and 4 peaks per construct, respectively.

dfPlot = dfPre %>%
  select(ConstructID, n, sum_KCL_ratio, sum_KCL_ratio_norm) %>%
  unique() %>%
  ungroup() %>%
  mutate(n = ifelse(n >= 10, ">=10", n)) %>%
  mutate(n = factor(n, levels = c(as.character(1:9), ">=10")))

colBright = c(sapply(1:10, function(x){pal_material(palette = "blue-grey", reverse = TRUE)(10)[x]}))
colDark = c("#151b1e", "#2a363c", "#34434b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b")


p1 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio, fill = n, color = n)) +
  geom_boxplot() +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = c(sapply(1:10, function(x){pal_material(palette = "blue-grey", reverse = TRUE)(10)[x]}))) +
  scale_color_manual(values = colDark) +
  labs(
    title = "G4 potential of construct - V1",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio)]"
  )

dfMean = dfPlot %>%
  group_by(n) %>%
  summarise(med = median(sum_KCL_ratio_norm), mean = mean(sum_KCL_ratio_norm), N = n())


p2 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio_norm, fill = n, color = n)) +
  geom_boxplot() +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = colBright) +
  scale_color_manual(values = colDark) +
  labs(
    title = "G4 potential of construct - V2",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio) / #peaks]"
  )

p3 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio_norm, fill = n, color = n)) +
  geom_boxplot(outlier.shape = NA) +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = colBright) +
  scale_color_manual(values = colDark) +
  coord_cartesian(ylim = c(0,0.4)) +
  labs(
    title = "G4 potential of construct - V3",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio) / #peaks]"
  )

p4 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio_norm, fill = n, color = n)) +
  geom_boxplot(outlier.shape = NA) +
  geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
  geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = colBright) +
  scale_color_manual(values = colDark) +
  coord_cartesian(ylim = c(-0.01,0.4)) +
  labs(
    title = "G4 potential of construct - V4",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio) / #peaks]"
  ) +
  geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)

p5 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio_norm, fill = n, color = n)) +
  geom_boxplot(outlier.shape = NA) +
  geom_quasirandom_rast(shape = 21, stroke = 0.3, size = 0.1) +
  geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
  geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = colBright) +
  scale_color_manual(values = colDark) +
  coord_cartesian(ylim = c(-0.01,0.4)) +
  labs(
    title = "G4 potential of construct - V5",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio) / #peaks]"
  ) +
  geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)

p6 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio_norm, fill = n, color = n)) +
  geom_half_boxplot(outlier.shape = NA) +
  geom_half_point(shape = 21, stroke = 0.3, size = 0.1) +
  geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
  geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = colBright) +
  scale_color_manual(values = colDark) +
  coord_cartesian(ylim = c(-0.01,0.4)) +
  labs(
    title = "G4 potential of construct - V6",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio) / #peaks]"
  ) +
  geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)

p1 + p2 + p3 + p4 + p5 + p6
Construct G4 potential. The strength of each peak is given by the ratio of counts in the respective peak over the total counts per construct. The Total potential of a construct is given by the sum of its peaks (divided by the number of peaks).

Figure 13: Construct G4 potential
The strength of each peak is given by the ratio of counts in the respective peak over the total counts per construct. The Total potential of a construct is given by the sum of its peaks (divided by the number of peaks).

The peak strength is computed as a ratio of crosslinks within a peak over the total number of crosslinks on the hosting construct. The sum of this ratio from all peaks of a given constructs denotes the G4 potential of the respective construct. This sums is then divided by the number of peaks for normalization and give the total construct ratio.

dfBestRatio = dfPre %>% 
  group_by(ConstructID) %>%
  dplyr::slice(1) %>%
  ungroup() %>%
  mutate(n = ifelse(n >= 10, ">=10", n)) %>%
  mutate(n = factor(n, levels = c(as.character(1:9), ">=10")))

colBright = c(sapply(1:10, function(x){pal_material(palette = "blue-grey", reverse = TRUE)(10)[x]}))
colDark = c("#151b1e", "#2a363c", "#34434b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b")


p1 = ggplot(dfBestRatio, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
  geom_boxplot() +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = c(sapply(1:10, function(x){pal_material(palette = "blue-grey", reverse = TRUE)(10)[x]}))) +
  scale_color_manual(values = colDark) +
  labs(
    title = "G4 potential of construct - V1",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio)]"
  )

dfMean = dfBestRatio %>%
  group_by(n) %>%
  summarise(med = median(xlinksKCL_ratio), mean = mean(xlinksKCL_ratio), N = n())


p2 = ggplot(dfBestRatio, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
  geom_boxplot() +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = colBright) +
  scale_color_manual(values = colDark) +
  labs(
    title = "G4 potential of construct - V2",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio) / #peaks]"
  )

p3 = ggplot(dfBestRatio, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
  geom_boxplot(outlier.shape = NA) +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = colBright) +
  scale_color_manual(values = colDark) +
  coord_cartesian(ylim = c(0,0.55)) +
  labs(
    title = "G4 potential of construct - V3",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio) / #peaks]"
  )

p4 = ggplot(dfBestRatio, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
  geom_boxplot(outlier.shape = NA) +
  geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
  geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = colBright) +
  scale_color_manual(values = colDark) +
  coord_cartesian(ylim = c(-0.01,0.55)) +
  labs(
    title = "G4 potential of construct - V4",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio) / #peaks]"
  ) +
  geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)

p5 = ggplot(dfBestRatio, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
  geom_boxplot(outlier.shape = NA) +
  geom_quasirandom_rast(shape = 21, stroke = 0.3, size = 0.1) +
  geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
  geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = colBright) +
  scale_color_manual(values = colDark) +
  coord_cartesian(ylim = c(-0.01,0.55)) +
  labs(
    title = "G4 potential of construct - V5",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio) / #peaks]"
  ) +
  geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)

p6 = ggplot(dfBestRatio, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
  geom_half_boxplot(outlier.shape = NA) +
  geom_half_point(shape = 21, stroke = 0.3, size = 0.1) +
  geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
  geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = colBright) +
  scale_color_manual(values = colDark) +
  coord_cartesian(ylim = c(-0.01,0.55)) +
  labs(
    title = "G4 potential of construct - V6",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio) / #peaks]"
  ) +
  geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)

p1 + p2 + p3 + p4 + p5 + p6
Construct G4 potential - based on best peak. The strength of each peak is given by the ratio of counts in the respective peak over the total counts per construct. The Total potential of a construct is given by the sum of its peaks (divided by the number of peaks).

Figure 14: Construct G4 potential - based on best peak
The strength of each peak is given by the ratio of counts in the respective peak over the total counts per construct. The Total potential of a construct is given by the sum of its peaks (divided by the number of peaks).

Finally I export all information for the G4 containing RT-Stop peaks as .xlsx and .rds. The G4Peaks file contains the representative G4 peak for each construct classified as G4. The AllG4Peaks file contains the relative position of all upregulated G4 peaks for each construct classified as G4.

# Match results to all G4 peaks
idx = match(names(AllG4Peaks), dfPre$PeakID)
AllG4Peaks$xlinksKCL_ratio = dfPre$xlinksKCL_ratio[idx]
AllG4Peaks$peaksPosition = sapply(strsplit(names(AllG4Peaks),"_"), `[`, 2)
AllG4Peaks$sum_KCL_ratio_norm = dfPre$sum_KCL_ratio_norm[idx]

# Match results to best G4 peak
# -> Select the strongest peak per construct as representative position 
keep = dfBestRatio %>% pull(PeakID)
G4Peaks = AllG4Peaks[names(AllG4Peaks) %in% keep,]
idx = match(names(G4Peaks), dfBestRatio$PeakID)
G4Peaks$xlinksKCL_ratio = dfBestRatio$xlinksKCL_ratio[idx]
G4Peaks$peaksPosition = sapply(strsplit(names(G4Peaks),"_"), `[`, 2)
G4Peaks$sum_KCL_ratio_norm = dfBestRatio$sum_KCL_ratio_norm[idx]

# Save G4 peak info
resChange$PeakID = rownames(resChange)

df = G4Peaks %>%
  as.data.frame() %>%
  tibble::rownames_to_column("PeakID") %>%
  rename("ConstructID" = "seqnames") %>%
  mutate(RegionID = constructInfo2$Region.ID[match(ConstructID, constructInfo2$Construct.ID)]) %>%
  left_join(y = resChange, by = c("PeakID")) %>%
  select(-c(dir, constructID, sig, peaksPosition)) %>%
  relocate(RegionID, ConstructID, PeakID)

G4Peaks$ConstructID = seqnames(G4Peaks)
G4Peaks$RegionID = df$RegionID[match(df$ConstructID, G4Peaks$ConstructID)]

write.xlsx(df, file = "./data/g4peaklist.xlsx", sheetName = "G4Peaks")
saveRDS(G4Peaks, file = "./data/G4Peaks.rds")
saveRDS(AllG4Peaks, file = "./data/AllG4Peaks.rds")

6.2 No G4 containing constructs

Just like for the G4 containing constructs we assign a representative peak for each of the No G4 containing constructs, computing the same ratio per peak and construct.

# Calculate peak and construct strength 
AllNoG4Peaks = cov[seqnames(cov) %in% rownames(noG4Const)]
mcols(AllNoG4Peaks) = mcols(AllNoG4Peaks)[1:3]

dfNoPre = as.data.frame(AllNoG4Peaks) %>%
  mutate(xlinksKCL = X1_GTP_KCL + X2_GTP_KCL + X3_GTP_KCL) %>%
  tibble::rownames_to_column("PeakID") %>%
  select(PeakID, seqnames, xlinksKCL) %>%
  rename("ConstructID" = "seqnames") %>%
  group_by(ConstructID) %>%
  left_join( y = constCountsNew, by = "ConstructID") %>% 
  mutate(xlinksKCL_total = GTP_KCL_rep1 + GTP_KCL_rep2 + GTP_KCL_rep3) %>%
  mutate(xlinksKCL_ratio = xlinksKCL / xlinksKCL_total) %>%
  select(c(PeakID, ConstructID, starts_with("xlinks"))) %>%
  arrange(desc(xlinksKCL_ratio), .by_group = TRUE) %>%
  mutate(sel = as.factor(seq_along(PeakID)) ) %>%
  summarise(PeakID, ConstructID, xlinksKCL, xlinksKCL_total, xlinksKCL_ratio, sel, n = n(), sum_KCL_ratio = sum(xlinksKCL_ratio)) %>%
  mutate(sum_KCL_ratio_norm = sum_KCL_ratio/ n)
dfPlot = dfNoPre %>%
  select(ConstructID, n, sum_KCL_ratio, sum_KCL_ratio_norm) %>%
  unique() %>%
  ungroup() %>%
  mutate(n = ifelse(n >= 10, ">=10", n)) %>%
  mutate(n = factor(n, levels = c(as.character(1:9), ">=10")))

colBright = c(sapply(1:10, function(x){pal_material(palette = "blue-grey", reverse = TRUE)(10)[x]}))
colDark = c("#151b1e", "#2a363c", "#34434b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b")


p1 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio, fill = n, color = n)) +
  geom_boxplot() +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = c(sapply(1:10, function(x){pal_material(palette = "blue-grey", reverse = TRUE)(10)[x]}))) +
  scale_color_manual(values = colDark) +
  labs(
    title = "G4 potential of construct - V1",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio)]"
  )

dfMean = dfPlot %>%
  group_by(n) %>%
  summarise(med = median(sum_KCL_ratio_norm), mean = mean(sum_KCL_ratio_norm), N = n())


p2 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio_norm, fill = n, color = n)) +
  geom_boxplot() +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = colBright) +
  scale_color_manual(values = colDark) +
  labs(
    title = "G4 potential of construct - V2",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio) / #peaks]"
  )

p3 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio_norm, fill = n, color = n)) +
  geom_boxplot(outlier.shape = NA) +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = colBright) +
  scale_color_manual(values = colDark) +
  coord_cartesian(ylim = c(0,0.1)) +
  labs(
    title = "G4 potential of construct - V3",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio) / #peaks]"
  )

p4 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio_norm, fill = n, color = n)) +
  geom_boxplot(outlier.shape = NA) +
  geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
  geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = colBright) +
  scale_color_manual(values = colDark) +
  coord_cartesian(ylim = c(-0.01,0.1)) +
  labs(
    title = "G4 potential of construct - V4",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio) / #peaks]"
  ) +
  geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)

p5 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio_norm, fill = n, color = n)) +
  geom_boxplot(outlier.shape = NA) +
  geom_quasirandom_rast(shape = 21, stroke = 0.3, size = 0.1) +
  geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
  geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = colBright) +
  scale_color_manual(values = colDark) +
  coord_cartesian(ylim = c(-0.01,0.1)) +
  labs(
    title = "G4 potential of construct - V5",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio) / #peaks]"
  ) +
  geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)

p6 = ggplot(dfPlot, aes(x = n, y = sum_KCL_ratio_norm, fill = n, color = n)) +
  geom_half_boxplot(outlier.shape = NA) +
  geom_half_point(shape = 21, stroke = 0.3, size = 0.1) +
  geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
  geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = colBright) +
  scale_color_manual(values = colDark) +
  coord_cartesian(ylim = c(-0.01,0.1)) +
  labs(
    title = "G4 potential of construct - V6",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio) / #peaks]"
  ) +
  geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)

p1 + p2 + p3 + p4 + p5 + p6
No G4 construct potential. The strength of each peak is given by the ratio of counts in the respective peak over the total counts per construct. The Total potential of a construct is given by the sum of its peaks (divided by the number of peaks).

Figure 15: No G4 construct potential
The strength of each peak is given by the ratio of counts in the respective peak over the total counts per construct. The Total potential of a construct is given by the sum of its peaks (divided by the number of peaks).

dfBestRatioNoG4 = dfNoPre %>% 
  group_by(ConstructID) %>%
  dplyr::slice(1) %>%
  ungroup() %>%
  mutate(n = ifelse(n >= 10, ">=10", n)) %>%
  mutate(n = factor(n, levels = c(as.character(1:9), ">=10")))

colBright = c(sapply(1:10, function(x){pal_material(palette = "blue-grey", reverse = TRUE)(10)[x]}))
colDark = c("#151b1e", "#2a363c", "#34434b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b", "#3e525b")

p1 = ggplot(dfBestRatioNoG4, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
  geom_boxplot() +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = c(sapply(1:10, function(x){pal_material(palette = "blue-grey", reverse = TRUE)(10)[x]}))) +
  scale_color_manual(values = colDark) +
  labs(
    title = "G4 potential of construct - V1",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio)]"
  )

dfMean = dfBestRatioNoG4 %>%
  group_by(n) %>%
  summarise(med = median(xlinksKCL_ratio), mean = mean(xlinksKCL_ratio), N = n())

p2 = ggplot(dfBestRatioNoG4, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
  geom_boxplot() +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = colBright) +
  scale_color_manual(values = colDark) +
  labs(
    title = "G4 potential of construct - V2",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio) / #peaks]"
  )

p3 = ggplot(dfBestRatioNoG4, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
  geom_boxplot(outlier.shape = NA) +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = colBright) +
  scale_color_manual(values = colDark) +
  coord_cartesian(ylim = c(0,0.55)) +
  labs(
    title = "G4 potential of construct - V3",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio) / #peaks]"
  )

p4 = ggplot(dfBestRatioNoG4, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
  geom_boxplot(outlier.shape = NA) +
  geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
  geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = colBright) +
  scale_color_manual(values = colDark) +
  coord_cartesian(ylim = c(-0.01,0.55)) +
  labs(
    title = "G4 potential of construct - V4",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio) / #peaks]"
  ) +
  geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)

p5 = ggplot(dfBestRatioNoG4, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
  geom_boxplot(outlier.shape = NA) +
  geom_quasirandom_rast(shape = 21, stroke = 0.3, size = 0.1) +
  geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
  geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = colBright) +
  scale_color_manual(values = colDark) +
  coord_cartesian(ylim = c(-0.01,0.55)) +
  labs(
    title = "G4 potential of construct - V5",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio) / #peaks]"
  ) +
  geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)

p6 = ggplot(dfBestRatioNoG4, aes(x = n, y = xlinksKCL_ratio, fill = n, color = n)) +
  geom_half_boxplot(outlier.shape = NA) +
  geom_half_point(shape = 21, stroke = 0.3, size = 0.1) +
  geom_line(data = dfMean, aes(x = n, y = med, group = 1), size = 1, color = "#1a1a1a") +
  geom_point(data = dfMean, aes(x = n, y = med, group = 1, fill = n), color = "#1a1a1a", shape = 21, size = 4, stroke = 1.5) +
  theme_pub() +
  theme( legend.position = "none") +
  scale_fill_manual(values = colBright) +
  scale_color_manual(values = colDark) +
  coord_cartesian(ylim = c(-0.01,0.55)) +
  labs(
    title = "G4 potential of construct - V6",
    x = "#Peaks in construct",
    y = "Construct strength [sum(peak strenth ratio) / #peaks]"
  ) +
  geom_text(data = dfMean, aes(x = n, y = 0, label = N), color = "#1a1a1a", size = 2.5)

p1 + p2 + p3 + p4 + p5 + p6
No G4 construct potential - based on best peak. The strength of each peak is given by the ratio of counts in the respective peak over the total counts per construct. The Total potential of a construct is given by the sum of its peaks (divided by the number of peaks).

Figure 16: No G4 construct potential - based on best peak
The strength of each peak is given by the ratio of counts in the respective peak over the total counts per construct. The Total potential of a construct is given by the sum of its peaks (divided by the number of peaks).

Finally I export all information for the No G4 containing RT-Stop peaks as .xlsx and .rds. The NoG4Peaks file contains the representative No G4 peak for each construct classified as No G4. The AllNoG4Peaks file contains the relative position of all upregulated No G4 peaks for each construct classified as No G4.

# Match results to all no G4 peaks
idx = match(names(AllNoG4Peaks), dfNoPre$PeakID)
AllNoG4Peaks$xlinksKCL_ratio = dfNoPre$xlinksKCL_ratio[idx]
AllNoG4Peaks$peaksPosition = sapply(strsplit(names(AllNoG4Peaks),"_"), `[`, 2)
AllNoG4Peaks$sum_KCL_ratio_norm = dfNoPre$sum_KCL_ratio_norm[idx]

# Match results to best no G4 peak
# -> Select the strongest peak per construct as representative position 
keepNoG4 = dfBestRatioNoG4 %>% pull(PeakID)
NoG4Peaks = AllNoG4Peaks[names(AllNoG4Peaks) %in% keepNoG4,]
idx = match(names(NoG4Peaks), dfBestRatioNoG4$PeakID)
NoG4Peaks$xlinksKCL_ratio = dfBestRatioNoG4$xlinksKCL_ratio[idx]
NoG4Peaks$peaksPosition = sapply(strsplit(names(NoG4Peaks),"_"), `[`, 2)
NoG4Peaks$sum_KCL_ratio_norm = dfBestRatioNoG4$sum_KCL_ratio_norm[idx]

df = NoG4Peaks %>%
  as.data.frame() %>%
  tibble::rownames_to_column("PeakID") %>%
  rename("ConstructID" = "seqnames") %>%
  mutate(RegionID = constructInfo2$Region.ID[match(ConstructID, constructInfo2$Construct.ID)]) %>%
  left_join(y = resChange, by = c("PeakID")) %>%
  select(-c(dir, constructID, sig, peaksPosition)) %>%
  relocate(RegionID, ConstructID, PeakID)

NoG4Peaks$ConstructID = seqnames(NoG4Peaks)
NoG4Peaks$RegionID = df$RegionID[match(df$ConstructID, NoG4Peaks$ConstructID)]

write.xlsx(df, file = "./data/g4peaklist.xlsx", sheetName = "NoG4Peaks", append = TRUE)
saveRDS(NoG4Peaks, file = "./data/NoG4Peaks.rds")
saveRDS(AllNoG4Peaks, file = "./data/AllNoG4Peaks.rds")

7 Sequence logo plots

Here we explore the G4 sequence composition for the main RT-Stop peaks of constructs with and without G4. The following plots show a combination of logos computed from all peaks or the best peak on a represetnative G4 (and No G4) construct.

getPeakSeq <- function(peak, adjacentNucs, trimRight) {
  r1 = (peak) %>% as.data.frame()
  seq <- constructInfo2$Seq[match(r1$seqnames, constructInfo2$Construct.ID)]
  peakSeq <- substr(seq, start=r1$start-adjacentNucs, stop=r1$start+adjacentNucs-trimRight)
  return(peakSeq)
}
df1 = getPeakSeq(AllG4Peaks, adjacentNucs = 10, trimRight = 9)
df2 = getPeakSeq(AllNoG4Peaks, adjacentNucs = 10, trimRight = 9)

p1 = ggseqlogo(df1) +
  geom_hline(yintercept = 0) +
  ylim(0,2) +
  scale_x_continuous(breaks=c(1:12), labels=c(-11:0)) +
  labs(title = "G4 peaks")

p2 = ggseqlogo(df2) +
  geom_hline(yintercept = 0) +
  ylim(0,2) +
  scale_x_continuous(breaks=c(1:12), labels=c(-11:0)) +
  labs(title = "No G4 peaks") 

p1 + p2
Sequence logo plots. All peaks on G4 and no G4 constructs.

Figure 17: Sequence logo plots
All peaks on G4 and no G4 constructs.

df1 = getPeakSeq(G4Peaks, adjacentNucs = 10, trimRight = 9)
df2 = getPeakSeq(NoG4Peaks, adjacentNucs = 10, trimRight = 9)

p1 = ggseqlogo(df1) +
  geom_hline(yintercept = 0) +
  ylim(0,2) +
  scale_x_continuous(breaks=c(1:12), labels=c(-11:0)) +
  labs(title = "G4 peaks")

p2 = ggseqlogo(df2) +
  geom_hline(yintercept = 0) +
  ylim(0,2) +
  scale_x_continuous(breaks=c(1:12), labels=c(-11:0)) +
  labs(title = "No G4 peaks") 

p1 + p2
Sequence logo plots. Best peaks of G4 and no G4 constructs.

Figure 18: Sequence logo plots
Best peaks of G4 and no G4 constructs.

Peaks are grouped into 20% stepwise bins, where grouping is either done based on the best peak ratio or the total construct ratio. This is again done for all or the best G4 and No G4 peaks.

# Best peak per construct
# -> Split by best peak ratio
q = quantile(G4Peaks$xlinksKCL_ratio, probs = seq(0, 1, by = 0.2))
s = findInterval(G4Peaks$xlinksKCL_ratio, q, all.inside = TRUE)
mcols(G4Peaks)$GTP_KCL_ratio_bin = s
splitPeaks = split(G4Peaks, G4Peaks$GTP_KCL_ratio_bin)
df = lapply(splitPeaks, function(x){
  getPeakSeq(x, adjacentNucs = 17, trimRight = 16)
})
names(df) = paste0(seq(0,80, by = 20), "-", seq(20,100, by = 20), "%")

p1 = ggseqlogo(df, ncol = 5) +
  geom_hline(yintercept = 0) +
  geom_hline(yintercept = 2, linetype = "dashed") +
  ylim(0,2) +
  scale_x_continuous(breaks=c(seq(1, 20, by = 2)), labels=c(seq(-19, -1, by = 2))) +
  labs(
    title = "Best G4 peaks - Split by best peak ratio",
    ) +
  theme(axis.text=element_text(size=8))

# Best peak per construct
# -> Split by total construct ratio
q = quantile(G4Peaks$sum_KCL_ratio_norm, probs = seq(0, 1, by = 0.2))
s = findInterval(G4Peaks$sum_KCL_ratio_norm, q, all.inside = TRUE)
mcols(G4Peaks)$GTP_KCL_sum_ratio_bin = s
splitPeaks = split(G4Peaks, G4Peaks$GTP_KCL_sum_ratio_bin)
df = lapply(splitPeaks, function(x){
  getPeakSeq(x, adjacentNucs = 17, trimRight = 16)
})
names(df) = paste0(seq(0,80, by = 20), "-", seq(20,100, by = 20), "%")

p2 = ggseqlogo(df, ncol = 5) +
  geom_hline(yintercept = 0) +
  geom_hline(yintercept = 2, linetype = "dashed") +
  ylim(0,2) +
  scale_x_continuous(breaks=c(seq(1, 20, by = 2)), labels=c(seq(-19, -1, by = 2))) +
  labs(
    title = "Best G4 peaks - Split by construct ratio",
    ) +
  theme(axis.text=element_text(size=8))

# All peaks per construct
# -> Split by best peak ratio
q = quantile(AllG4Peaks$xlinksKCL_ratio, probs = seq(0, 1, by = 0.2))
s = findInterval(AllG4Peaks$xlinksKCL_ratio, q, all.inside = TRUE)
mcols(AllG4Peaks)$GTP_KCL_ratio_bin = s
splitPeaks = split(AllG4Peaks, AllG4Peaks$GTP_KCL_ratio_bin)
df = lapply(splitPeaks, function(x){
  getPeakSeq(x, adjacentNucs = 17, trimRight = 16)
})
names(df) = paste0(seq(0,80, by = 20), "-", seq(20,100, by = 20), "%")

p3 = ggseqlogo(df, ncol = 5) +
  geom_hline(yintercept = 0) +
  geom_hline(yintercept = 2, linetype = "dashed") +
  ylim(0,2) +
  scale_x_continuous(breaks=c(seq(1, 20, by = 2)), labels=c(seq(-19, -1, by = 2))) +
  labs(
    title = "All G4 peaks - Split by best peak ratio",
    ) +
  theme(axis.text=element_text(size=8))

# All peaks per construct
# -> Split by total construct ratio
q = quantile(AllG4Peaks$sum_KCL_ratio_norm, probs = seq(0, 1, by = 0.2))
s = findInterval(AllG4Peaks$sum_KCL_ratio_norm, q, all.inside = TRUE)
mcols(AllG4Peaks)$GTP_KCL_sum_ratio_bin = s
splitPeaks = split(AllG4Peaks, AllG4Peaks$GTP_KCL_sum_ratio_bin)
df = lapply(splitPeaks, function(x){
  getPeakSeq(x, adjacentNucs = 17, trimRight = 16)
})
names(df) = paste0(seq(0,80, by = 20), "-", seq(20,100, by = 20), "%")

p4 = ggseqlogo(df, ncol = 5) +
  geom_hline(yintercept = 0) +
  geom_hline(yintercept = 2, linetype = "dashed") +
  ylim(0,2) +
  scale_x_continuous(breaks=c(seq(1, 20, by = 2)), labels=c(seq(-19, -1, by = 2))) +
  labs(
    title = "All G4 peaks - Split by construct ratio",
    ) +
  theme(axis.text=element_text(size=8))

p1 / p2 / p3 / p4
G4 Peaks split by KCL ratio in 20% bins.

Figure 19: G4 Peaks split by KCL ratio in 20% bins

# Best peak per construct
# -> Split by best peak ratio
q = quantile(NoG4Peaks$xlinksKCL_ratio, probs = seq(0, 1, by = 0.2))
s = findInterval(NoG4Peaks$xlinksKCL_ratio, q, all.inside = TRUE)
mcols(NoG4Peaks)$GTP_KCL_ratio_bin = s
splitPeaks = split(NoG4Peaks, NoG4Peaks$GTP_KCL_ratio_bin)
df = lapply(splitPeaks, function(x){
  getPeakSeq(x, adjacentNucs = 17, trimRight = 16)
})
names(df) = paste0(seq(0,80, by = 20), "-", seq(20,100, by = 20), "%")

p1 = ggseqlogo(df, ncol = 5) +
  geom_hline(yintercept = 0) +
  geom_hline(yintercept = 2, linetype = "dashed") +
  ylim(0,2) +
  scale_x_continuous(breaks=c(seq(1, 20, by = 2)), labels=c(seq(-19, -1, by = 2))) +
  labs(
    title = "Best no G4 peaks - Split by best peak ratio",
    ) +
  theme(axis.text=element_text(size=8))

# Best peak per construct
# -> Split by total construct ratio
q = quantile(NoG4Peaks$sum_KCL_ratio_norm, probs = seq(0, 1, by = 0.2))
s = findInterval(NoG4Peaks$sum_KCL_ratio_norm, q, all.inside = TRUE)
mcols(NoG4Peaks)$GTP_KCL_sum_ratio_bin = s
splitPeaks = split(NoG4Peaks, NoG4Peaks$GTP_KCL_sum_ratio_bin)
df = lapply(splitPeaks, function(x){
  getPeakSeq(x, adjacentNucs = 17, trimRight = 16)
})
names(df) = paste0(seq(0,80, by = 20), "-", seq(20,100, by = 20), "%")

p2 = ggseqlogo(df, ncol = 5) +
  geom_hline(yintercept = 0) +
  geom_hline(yintercept = 2, linetype = "dashed") +
  ylim(0,2) +
  scale_x_continuous(breaks=c(seq(1, 20, by = 2)), labels=c(seq(-19, -1, by = 2))) +
  labs(
    title = "Best no G4 peaks - Split by construct ratio",
    ) +
  theme(axis.text=element_text(size=8))

# All peaks per construct
# -> Split by best peak ratio
q = quantile(AllNoG4Peaks$xlinksKCL_ratio, probs = seq(0, 1, by = 0.2))
s = findInterval(AllNoG4Peaks$xlinksKCL_ratio, q, all.inside = TRUE)
mcols(AllNoG4Peaks)$GTP_KCL_ratio_bin = s
splitPeaks = split(AllNoG4Peaks, AllNoG4Peaks$GTP_KCL_ratio_bin)
df = lapply(splitPeaks, function(x){
  getPeakSeq(x, adjacentNucs = 17, trimRight = 16)
})
names(df) = paste0(seq(0,80, by = 20), "-", seq(20,100, by = 20), "%")

p3 = ggseqlogo(df, ncol = 5) +
  geom_hline(yintercept = 0) +
  geom_hline(yintercept = 2, linetype = "dashed") +
  ylim(0,2) +
  scale_x_continuous(breaks=c(seq(1, 20, by = 2)), labels=c(seq(-19, -1, by = 2))) +
  labs(
    title = "All no G4 peaks - Split by best peak ratio",
    ) +
  theme(axis.text=element_text(size=8))

# All peaks per construct
# -> Split by total construct ratio
q = quantile(AllNoG4Peaks$sum_KCL_ratio_norm, probs = seq(0, 1, by = 0.2))
s = findInterval(AllNoG4Peaks$sum_KCL_ratio_norm, q, all.inside = TRUE)
mcols(AllNoG4Peaks)$GTP_KCL_sum_ratio_bin = s
splitPeaks = split(AllNoG4Peaks, AllNoG4Peaks$GTP_KCL_sum_ratio_bin)
df = lapply(splitPeaks, function(x){
  getPeakSeq(x, adjacentNucs = 17, trimRight = 16)
})
names(df) = paste0(seq(0,80, by = 20), "-", seq(20,100, by = 20), "%")

p4 = ggseqlogo(df, ncol = 5) +
  geom_hline(yintercept = 0) +
  geom_hline(yintercept = 2, linetype = "dashed") +
  ylim(0,2) +
  scale_x_continuous(breaks=c(seq(1, 20, by = 2)), labels=c(seq(-19, -1, by = 2))) +
  labs(
    title = "All no G4 peaks - Split by construct ratio",
    ) +
  theme(axis.text=element_text(size=8))

p1 / p2 / p3 / p4
No G4 Peaks split by KCL ratio in 20% bins.

Figure 20: No G4 Peaks split by KCL ratio in 20% bins

8 Final exports

export_G4Constructs = G4Const %>%
  tibble::rownames_to_column("Construct.ID") %>%
  select(Construct.ID, Region.ID) %>%
  mutate(regulationGroup = constructInfo2$statusCombined[match(Construct.ID, constructInfo2$Construct.ID)]) %>%
  mutate(regulationGroupDetail = constructInfo2$status[match(Construct.ID, constructInfo2$Construct.ID)]) %>%
  mutate(strongestPeakStrengthRatio = G4Peaks$xlinksKCL_ratio[match(Construct.ID, G4Peaks$ConstructID)]) %>% 
  mutate(constructStrengthRatio = G4Peaks$sum_KCL_ratio_norm[match(Construct.ID, G4Peaks$ConstructID)])
head(export_G4Constructs)
##   Construct.ID          Region.ID regulationGroup regulationGroupDetail
## 1        00003 non-regulated.1087              G4        Significant-Up
## 2        00011   non-regulated.82              G4        Significant-Up
## 3        00026     regulated.1137              G4     Significant-Mixed
## 4        00054  non-regulated.510              G4        Significant-Up
## 5        00076  non-regulated.111              G4        Significant-Up
## 6        00086  non-regulated.361              G4     Significant-Mixed
##   strongestPeakStrengthRatio constructStrengthRatio
## 1                 0.12029162             0.05847509
## 2                 0.43892694             0.11887367
## 3                 0.55917513             0.15376569
## 4                 0.09424872             0.09424872
## 5                 0.04566585             0.04566585
## 6                 0.11026400             0.11026400
export_NoG4Constructs = noG4Const %>%
  tibble::rownames_to_column("Construct.ID") %>%
  select(Construct.ID, Region.ID) %>%
  mutate(regulationGroup = constructInfo2$statusCombined[match(Construct.ID, constructInfo2$Construct.ID)]) %>%
  mutate(regulationGroupDetail = constructInfo2$status[match(Construct.ID, constructInfo2$Construct.ID)]) %>%
  mutate(strongestPeakStrengthRatio = NoG4Peaks$xlinksKCL_ratio[match(Construct.ID, NoG4Peaks$ConstructID)]) %>% 
  mutate(constructStrengthRatio = NoG4Peaks$sum_KCL_ratio_norm[match(Construct.ID, NoG4Peaks$ConstructID)])
head(export_NoG4Constructs)
##   Construct.ID          Region.ID regulationGroup regulationGroupDetail
## 1        00013     regulated.1091           No G4       Not significant
## 2        00024     regulated.1614           No G4      Significant-Down
## 3        00030 non-regulated.1125           No G4       Not significant
## 4        00040      regulated.217           No G4       Not significant
## 5        00072      regulated.755           No G4       Not significant
## 6        00075 non-regulated.1200           No G4       Not significant
##   strongestPeakStrengthRatio constructStrengthRatio
## 1                0.012107531            0.012107531
## 2                0.012225067            0.010971214
## 3                0.006931074            0.006931074
## 4                0.317385445            0.146731806
## 5                0.012882448            0.012882448
## 6                0.040380048            0.040380048
constructClassification = rbind.data.frame(export_G4Constructs, export_NoG4Constructs)
saveRDS(constructClassification, file = "./data/constructClassification.rds")

9 Session Information

sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] janitor_2.1.0               stringr_1.4.1              
##  [3] ggseqlogo_0.1               xlsx_0.6.5                 
##  [5] BindingSiteFinder_1.3.8     ggnewscale_0.4.8           
##  [7] ggrastr_1.0.1               gghalves_0.1.3             
##  [9] DESeq2_1.37.6               SummarizedExperiment_1.27.3
## [11] MatrixGenerics_1.9.1        matrixStats_0.62.0         
## [13] ggsci_2.9                   tidyr_1.2.1                
## [15] ggpointdensity_0.1.0        ggforce_0.4.1              
## [17] patchwork_1.1.2             ggtext_0.1.2               
## [19] forcats_0.5.2               ComplexHeatmap_2.13.1      
## [21] viridis_0.6.2               viridisLite_0.4.1          
## [23] gridExtra_2.3               ggrepel_0.9.1              
## [25] knitr_1.40                  kableExtra_1.3.4           
## [27] GenomicFeatures_1.49.7      UpSetR_1.4.0               
## [29] reshape2_1.4.4              dplyr_1.0.10               
## [31] AnnotationDbi_1.59.1        Biobase_2.57.1             
## [33] ggplot2_3.3.6               rtracklayer_1.57.0         
## [35] GenomicRanges_1.49.1        GenomeInfoDb_1.33.10       
## [37] IRanges_2.31.2              S4Vectors_0.35.4           
## [39] BiocGenerics_0.43.4         BiocStyle_2.25.0           
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2               tidyselect_1.2.0         RSQLite_2.2.18          
##   [4] htmlwidgets_1.5.4        BiocParallel_1.31.13     munsell_0.5.0           
##   [7] codetools_0.2-18         interp_1.1-3             withr_2.5.0             
##  [10] colorspace_2.0-3         filelock_1.0.2           highr_0.9               
##  [13] rstudioapi_0.14          rJava_1.0-6              labeling_0.4.2          
##  [16] GenomeInfoDbData_1.2.9   mixsqp_0.3-43            polyclip_1.10-0         
##  [19] bit64_4.0.5              farver_2.1.1             vctrs_0.5.1             
##  [22] generics_0.1.3           xfun_0.33                biovizBase_1.45.0       
##  [25] BiocFileCache_2.5.2      R6_2.5.1                 doParallel_1.0.17       
##  [28] ggbeeswarm_0.7.0.9000    clue_0.3-61              invgamma_1.1            
##  [31] locfit_1.5-9.6           AnnotationFilter_1.21.0  bitops_1.0-7            
##  [34] cachem_1.0.6             DelayedArray_0.23.2      assertthat_0.2.1        
##  [37] BiocIO_1.7.1             scales_1.2.1             nnet_7.3-18             
##  [40] beeswarm_0.4.0           gtable_0.3.1             Cairo_1.6-0             
##  [43] ensembldb_2.21.5         rlang_1.0.6              genefilter_1.79.0       
##  [46] systemfonts_1.0.4        GlobalOptions_0.1.2      splines_4.2.1           
##  [49] lazyeval_0.2.2           dichromat_2.0-0.1        checkmate_2.1.0         
##  [52] BiocManager_1.30.18      yaml_2.3.5               backports_1.4.1         
##  [55] Hmisc_4.7-1              gridtext_0.1.5           tools_4.2.1             
##  [58] bookdown_0.29            ellipsis_0.3.2           jquerylib_0.1.4         
##  [61] RColorBrewer_1.1-3       Rcpp_1.0.9               plyr_1.8.7              
##  [64] base64enc_0.1-3          progress_1.2.2           zlibbioc_1.43.0         
##  [67] purrr_0.3.5              RCurl_1.98-1.9           prettyunits_1.1.1       
##  [70] rpart_4.1.16             deldir_1.0-6             GetoptLong_1.0.5        
##  [73] ashr_2.2-54              cluster_2.1.4            magrittr_2.0.3          
##  [76] data.table_1.14.2        circlize_0.4.15          truncnorm_1.0-8         
##  [79] SQUAREM_2021.1           ProtGenerics_1.29.1      xlsxjars_0.6.1          
##  [82] hms_1.1.2                evaluate_0.17            xtable_1.8-4            
##  [85] XML_3.99-0.11            jpeg_0.1-9               shape_1.4.6             
##  [88] compiler_4.2.1           biomaRt_2.53.3           tibble_3.1.8            
##  [91] crayon_1.5.2             htmltools_0.5.3          Formula_1.2-4           
##  [94] geneplotter_1.75.0       lubridate_1.8.0          DBI_1.1.3               
##  [97] tweenr_2.0.2             dbplyr_2.2.1             MASS_7.3-58.1           
## [100] rappdirs_0.3.3           Matrix_1.5-1             cli_3.4.1               
## [103] parallel_4.2.1           Gviz_1.41.1              pkgconfig_2.0.3         
## [106] GenomicAlignments_1.33.1 foreign_0.8-83           xml2_1.3.3              
## [109] foreach_1.5.2            svglite_2.1.0            annotate_1.75.0         
## [112] vipor_0.4.5              bslib_0.4.0              webshot_0.5.4           
## [115] XVector_0.37.1           rvest_1.0.3              snakecase_0.11.0        
## [118] VariantAnnotation_1.43.3 digest_0.6.29            Biostrings_2.65.6       
## [121] rmarkdown_2.17           htmlTable_2.4.1          restfulr_0.0.15         
## [124] curl_4.3.3               Rsamtools_2.13.4         rjson_0.2.21            
## [127] lifecycle_1.0.3          jsonlite_1.8.2           BSgenome_1.65.2         
## [130] fansi_1.0.3              pillar_1.8.1             lattice_0.20-45         
## [133] KEGGREST_1.37.3          fastmap_1.1.0            httr_1.4.4              
## [136] survival_3.4-0           glue_1.6.2               png_0.1-7               
## [139] iterators_1.0.14         bit_4.0.4                stringi_1.7.8           
## [142] sass_0.4.2               blob_1.2.3               latticeExtra_0.6-30     
## [145] memoise_2.0.1            irlba_2.3.5.1